%matplotlib inline
import warnings
import os
import pandas as pd
import numpy as np
from ipyleaflet import Map, Marker, GeoJSON, Circle
from sklearn.linear_model import LinearRegression
from matplotlib.colors import LogNorm, rgb2hex
from numpy.linalg.linalg import LinAlgError
from scipy.stats import binned_statistic_2d
from geojson import Polygon, Feature, Point
from xgboost import XGBRegressor
from matplotlib.cm import hot_r
from itertools import product
from datetime import datetime
from pylab import rcParams
from scipy import stats
from tqdm import tqdm
from matplotlib import pyplot as plt
from statsmodels import api as sm
# Notebook-wide display settings.
rcParams['figure.figsize'] = 15, 8
warnings.filterwarnings('ignore')
# Load the May 2016 yellow-taxi trip records and the spatial region grid
# (regions.csv has west/east/south/north bounds per cell; ids 1..2500 are
# used as region identifiers below).
PATH_TO_DATA = 'data'
data = pd.read_csv(os.path.join(PATH_TO_DATA, 'yellow_tripdata_2016-05.csv'), parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])
regions = pd.read_csv(os.path.join(PATH_TO_DATA, 'regions.csv'), index_col=0, delimiter=';')
print(data.shape)
data.head()
# NYC bounding box used to filter pickup coordinates:
# longitude runs west ("bottom") -> east ("top"); latitude south ("left") -> north ("right").
longitude_bottom = -74.25559
longitude_top = -73.70001
latitude_left = 40.49612
latitude_right = 40.91553
def clean_data(df):
    """Drop degenerate trips and truncate pickup times to the hour.

    Keeps rows with a non-zero passenger count, a non-zero trip distance,
    distinct pickup/dropoff timestamps, and a pickup location strictly inside
    the NYC bounding box (module-level longitude_* / latitude_* constants).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw trip records; ``tpep_pickup_datetime`` must already be datetime.

    Returns
    -------
    pandas.DataFrame
        A filtered copy with ``tpep_pickup_datetime`` floored to the hour.
    """
    query = 'passenger_count != 0 & trip_distance != 0 & tpep_pickup_datetime != tpep_dropoff_datetime & ' + \
        ' pickup_longitude > {} & pickup_longitude < {} & pickup_latitude > {} & pickup_latitude < {} ' \
        .format(longitude_bottom, longitude_top, latitude_left, latitude_right)
    # .copy() so the column assignment below operates on a real frame rather
    # than a view of the caller's data (the original assignment triggered
    # pandas' SettingWithCopy behaviour on the query() result).
    df = df.query(query).copy()
    # Vectorised floor-to-hour replaces the per-row
    # apply(lambda x: x.replace(minute=0, second=0)) — same result, far faster.
    df['tpep_pickup_datetime'] = df['tpep_pickup_datetime'].dt.floor('H')
    return df
%%time
# Apply the filtering; prints the reduced row count.
data = clean_data(data)
print(data.shape)
%%time
binx=np.sort(regions.west.unique())[1:]
biny=np.sort(regions.south.unique())[1:]
x = data.pickup_longitude.values
y = data.pickup_latitude.values
_, _, _, regions_ids = binned_statistic_2d(x, y, regions, 'count', bins=[binx, biny])
regions_ids = regions_ids + 1
data['region'] = regions_ids
%%time
# Build the region x hour ride-count matrix.
# Timestamps are viewed as int64 (nanoseconds since epoch) so they can be
# used as numeric bin coordinates.
all_times = data['tpep_pickup_datetime'].view('int64')
# Region bin edges: ids 1..2500 plus a closing edge at 2501.
binx = regions.index.values.tolist() + [2501]
# Time bin edges: every observed hour plus midnight 2016-06-01 as the closing edge.
biny = np.sort(all_times.unique()).tolist() + [np.datetime64('2016-06-01T00:00:00.000000000').view('int64')]
x = data.region.values
y = all_times
matrix, _, _, _ = binned_statistic_2d(x, y, None, 'count', bins=[binx, biny])
# Rows: region ids; columns: the hourly timestamps of May 2016.
aggregated_df = pd.DataFrame(matrix, index=regions.index.values, columns= np.sort(data['tpep_pickup_datetime'].unique()))
# Locate the region containing a Midtown Manhattan reference point
# (40.748817, -73.985428 — presumably the Empire State Building area) and
# plot its hourly ride counts.
latitude = 40.748817
longitude = -73.985428
query = 'west < {} & east > {} & south < {} & north > {}'.format(longitude, longitude, latitude, latitude)
region = regions.query(query).index.values[0]
print(region)
_ = plt.figure(figsize = (15,5))
aggregated_df.loc[region].plot(kind="line",color="r")
plt.title('Count of rides per hour of a day')
plt.xlabel('Date')
_ = plt.ylabel('Count of riders')
# How densely populated is the region x hour matrix?
all_cells = aggregated_df.shape[0] * aggregated_df.shape[1]
empty_cells = np.count_nonzero(matrix == 0)
filled_cells = all_cells - empty_cells
pers = filled_cells / all_cells * 100
'All: {}, empty: {}, with rides: {}. Fill percentage: {}'.format(all_cells, empty_cells, filled_cells, pers)
# From here on `data` refers to the aggregated hourly counts, not raw trips.
data = aggregated_df
del aggregated_df
# Regions with no rides at all during the month.
sums = data.sum(axis=1)
sums = sums[sums == 0]
sums.shape
# ipyleaflet smoke test: a map centred on Manhattan with a single marker.
m = Map(center=[40.748817, -73.985428], zoom=10)
mark = Marker(location=[40.748817, -73.985428])
mark.visible
m += mark
m
def create_rectangle(row, color, opacity):
    """Build a styled GeoJSON rectangle Feature for one grid region.

    Parameters
    ----------
    row : sequence of 4 floats
        Region bounds; assumed (west, east, south, north) — matches the
        regions.csv column order used elsewhere (TODO confirm).
    color : str
        Hex fill colour.
    opacity : float
        Fill opacity (0 renders the cell invisible).

    Returns
    -------
    geojson.Feature
        Polygon feature with leaflet style properties.
    """
    # BUG FIX: the original unpacked `x1, x2, y1, *y2 = row`, which left y2 as
    # a one-element list and produced malformed polygon coordinates.
    x1, x2, y1, y2 = row
    coordinates = [[[x1, y1], [x1, y2], [x2, y2], [x2, y1]]]
    poly = Polygon(coordinates)
    prop = {"style":{"color":'black', "fillColor":color, "fillOpacity":opacity, "opacity":0.5, "weight":1}}
    feature = Feature(geometry=poly, properties=prop)
    return feature
def get_heat_map(keys):
    """Render a choropleth of per-region values on an ipyleaflet map.

    ``keys`` is a Series indexed by region id.  Regions missing from the
    index (or with value 0) are drawn fully transparent; otherwise the fill
    colour comes from a log-scaled hot colormap.
    """
    # Pre-compute one hex colour per distinct value.
    unique_vals = keys.unique()
    norm = LogNorm(vmin=1, vmax=1000000)
    palette = [rgb2hex(rgba[0:3]) for rgba in hot_r(norm(unique_vals))]
    color_dict = dict(zip(unique_vals, palette))
    heat_map = Map(center=[40.708817, -73.985428], zoom=10)
    for region in tqdm(range(1, 2501)):
        bounds = regions.loc[[region]].values[0]
        if region not in keys.index:
            color, opacity = '#FFFFFF', 0.0
        else:
            value = keys[region]
            color = color_dict[value]
            opacity = 0.0 if value == 0 else 0.5
        heat_map += GeoJSON(data=create_rectangle(bounds, color, opacity))
    return heat_map
# Heat map of total rides per region over the month.
get_heat_map(data.sum(axis=1))
# Separate map with a circle at (40.689249, -74.0445).
m3 = Map(center=[40.708817, -73.985428], zoom=10)
m3 += Circle(location=[40.689249, -74.044500])
m3
# Heat map of mean hourly rides per region.
get_heat_map(data.mean(axis=1))
means = data.mean(axis=1)
# Keep only regions averaging at least 5 rides per hour.
means = means[means >= 5]
print(means.shape)
get_heat_map(means)
# Hourly ride-count series for region 1282 (pre-extracted file).
train_df = pd.read_csv(os.path.join(PATH_TO_DATA, 'train_1282.csv'), index_col=0, parse_dates=[0])
print(train_df.shape)
train_df.head()
_ = train_df.plot(figsize=(15,5))
# Autocorrelation / partial autocorrelation up to 48 hourly lags.
_, ax = plt.subplots(2, 1, figsize=(15, 8))
sm.graphics.tsa.plot_acf(train_df['1282'].values.squeeze(), lags=48, ax=ax[0])
sm.graphics.tsa.plot_pacf(train_df['1282'].values.squeeze(), lags=48, ax=ax[1])
plt.show()
# Augmented Dickey-Fuller stationarity test (prints the p-value).
print("Критерий Дики-Фуллера: p=%f" % sm.tsa.stattools.adfuller(train_df['1282'])[1])
# Phase-shifted sine features for the daily (24 h) and weekly (168 h) cycles,
# tiled so that every training hour has a value at each phase shift.
hours_in_day = 24
sin_day = np.sin(np.array(range(1, hours_in_day + 1)) * 2 * np.pi / (hours_in_day))
sin_day = np.tile(sin_day, int(train_df.shape[0] / len(sin_day) + 1))
hours_in_week = 7 * 24
sin_week = np.sin(np.array(range(1, hours_in_week + 1)) * 2 * np.pi / (hours_in_week))
sin_week = np.tile(sin_week, int(train_df.shape[0] / len(sin_week) + 2))
fig = plt.figure(figsize=(15,5))
plt.plot(sin_week[:hours_in_week], label = 'week')
plt.plot(sin_day[:hours_in_week], label = 'day')
plt.title('Синусы')
_ = plt.legend()
# One column per phase offset: sin_day_i / sin_week_i is the base sine shifted
# by i hours, truncated to the series length.
days = {'sin_day_%d' % i: sin_day[i: train_df.shape[0] + i] for i in range(hours_in_day)}
weeks = {'sin_week_%d' % i: sin_week[i: train_df.shape[0] + i] for i in range(hours_in_week)}
values = {**days, **weeks}
sin_df = pd.DataFrame(values)
y = train_df['1282'].values
X = sin_df.values
y.shape, X.shape
# Regress the series on the sinusoidal features; the residuals are the part
# of the signal not explained by daily/weekly seasonality.
model = LinearRegression().fit(X, y)
y_pred = model.predict(X)
residuals = y - y_pred
train_df['residuals'] = residuals
_ = train_df['residuals'].plot(figsize=(20,5))
plt.figure(figsize=(15,10))
sm.tsa.seasonal_decompose(train_df.residuals.asfreq('H')).plot()
print("Критерий Дики-Фуллера: p=%f" % sm.tsa.stattools.adfuller(train_df.residuals)[1])
# Weekly (168-hour) seasonal differencing of the residuals, then re-test.
train_df['dif'] = train_df.residuals - train_df.residuals.shift(hours_in_week)
sm.tsa.seasonal_decompose(train_df.dif[hours_in_week:].asfreq('H')).plot()
print("Критерий Дики-Фуллера: p=%f" % sm.tsa.stattools.adfuller(train_df.dif[hours_in_week:])[1])
# ACF/PACF of the differenced residuals guide the SARIMA order choices below.
_, ax = plt.subplots(2, 1, sharex=True)
sm.graphics.tsa.plot_acf(train_df.dif[hours_in_week:].values.squeeze(), lags=48, ax=ax[0])
sm.graphics.tsa.plot_pacf(train_df.dif[hours_in_week:].values.squeeze(), lags=48, ax=ax[1])
plt.show()
# Candidate (p, q, P, Q) orders; the differencing orders d and D are fixed.
p = range(2)
q = range(3, 4)
P = range(1)
Q = range(2)
d, D = 1, 1
parameters = list(product(p, q, P, Q))
len(parameters)
%%time
# AIC-driven grid search over the SARIMA candidate orders.
results = []
best_aic = float("inf")
for param in parameters:
    try:
        # NOTE(review): the seasonal period is 12 here although the data is
        # hourly with daily (24) / weekly (168) cycles — confirm this is
        # intentional rather than a copy-paste from a monthly example.
        model=sm.tsa.statespace.SARIMAX(train_df.residuals, order=(param[0], d, param[1]),
                seasonal_order=(param[2], D, param[3], 12)).fit(disp=-1)
    # Skip parameter combinations where fitting raises.
    except LinAlgError:
        continue
    except ValueError:
        continue
    aic = model.aic
    # Track the lowest-AIC model.  NOTE(review): best_model is undefined if
    # every combination fails — acceptable in a notebook, but fragile.
    if aic < best_aic:
        best_model = model
        best_aic = aic
        best_param = param
    results.append([param, model.aic])
result_df = pd.DataFrame(results)
result_df.columns = ['parameters', 'aic']
result_df.sort_values(by = 'aic', ascending=True).head()
best_model.summary()
def get_features(df, K=20):
    """Build calendar-based regression features for an hourly series.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index is a DatetimeIndex (one row per hour).
    K : int, default 20
        Number of weekly (168 h) Fourier harmonics; produces sin_k / cos_k
        columns for k = 1..K.

    Returns
    -------
    pandas.DataFrame
        2*K Fourier columns, one-hot hour of day (h0..h23), day of week
        (dw0..dw6), day of month (dm0..dm30, where dm_i means day i+1), and
        an is_weekend flag (Saturday/Sunday).
    """
    t = np.arange(1, df.shape[0] + 1)
    features = pd.DataFrame()
    for k in range(1, K + 1):
        features['sin_%d' % k] = np.sin(t * 2. * k * np.pi / 168.)
        features['cos_%d' % k] = np.cos(t * 2. * k * np.pi / 168.)
    # reindex guards against categories absent from df (e.g. a month shorter
    # than 31 days), which made the original multi-column assignment fail on
    # a shape mismatch; behaviour on a full month is unchanged.
    hours = pd.get_dummies(df.index.hour).reindex(columns=range(24), fill_value=0)
    features[['h%d' % i for i in range(24)]] = hours.to_numpy()
    weekdays = pd.get_dummies(df.index.weekday).reindex(columns=range(7), fill_value=0)
    features[['dw%d' % i for i in range(7)]] = weekdays.to_numpy()
    days = pd.get_dummies(df.index.day).reindex(columns=range(1, 32), fill_value=0)
    features[['dm%d' % i for i in range(31)]] = days.to_numpy()
    features['is_weekend'] = (df.index.weekday > 4).astype(float)
    return features
def rolling_window(a, window):
    """Return, for every position i of ``a``, the ``window`` values that
    precede it.

    Output shape is (len(a), window); row i holds a[i-window:i], zero-padded
    where the index would be negative.
    """
    # Prepend `window` zeros so the earliest rows have full-length history.
    padded = np.concatenate((np.zeros(window), a), axis=0)
    # All length-`window` windows over the padded array; drop the final one,
    # which would include the current element itself.
    return np.lib.stride_tricks.sliding_window_view(padded, window)[:-1]
# Assemble the full modelling dataset: training period plus two held-out
# slices (train_05.csv / train_06.csv — presumably May and June; the June
# slice feeds the Kaggle submission.  Confirm against data prep).
train_df = pd.read_csv(os.path.join(PATH_TO_DATA, 'train.csv'), index_col=0, parse_dates=[0])
test_df = pd.read_csv(os.path.join(PATH_TO_DATA, 'train_05.csv'), index_col=0, parse_dates=[0])
kaggle_df = pd.read_csv(os.path.join(PATH_TO_DATA, 'train_06.csv'), index_col=0, parse_dates=[0])
concat_df = pd.concat((train_df, test_df, kaggle_df))
print(concat_df.shape)
concat_df.head()
# Per-region destination counts used as an additional lag-feature source.
destination_df = pd.read_csv(os.path.join(PATH_TO_DATA, 'small_destination.csv'), index_col=0, parse_dates=[0])
print(destination_df.shape)
destination_df.head()
# Calendar features for every hour of the combined period.
feature_df = get_features(concat_df)
print(feature_df.shape)
feature_df.head()
# For every region column: a (n_hours, 5) matrix of the previous 5 hourly
# values (zero-padded at the start) — lag features for the regressor.
values = [rolling_window(concat_df[c], 5) for c in train_df.columns]
previous_rodes = pd.Series(values, index=train_df.columns)
values = [rolling_window(destination_df[c], 5) for c in train_df.columns]
destination_rodes = pd.Series(values, index=train_df.columns)
# Train one XGBoost model per region on calendar + lag features, then predict
# the middle slice (between the training and Kaggle periods).
test_prediction_df = pd.DataFrame()
train_size = train_df.shape[0]
kaggle_size = kaggle_df.shape[0]
for c in tqdm(train_df.columns):
    X = np.concatenate((feature_df[:train_size].values, previous_rodes[c][:train_size]), axis=1)
    y = train_df[c]
    X_test = np.concatenate((feature_df[train_size:-kaggle_size].values, previous_rodes[c][train_size:-kaggle_size]), axis=1)
    model = XGBRegressor().fit(X, y)
    test_prediction_df[c] = model.predict(X_test)
# Mean absolute error over rolling 6-value windows, averaged across windows
# and regions.  NOTE(review): 739 looks like the number of evaluation hours
# in the held-out slice — confirm it matches test_df length.
error = 0
size_of_predictions = 6 * 739 * test_prediction_df.shape[1]
for c in tqdm(test_prediction_df.columns):
    error += sum(sum(abs(rolling_window(test_prediction_df[c], 6) - rolling_window(test_df[c], 6)))) / size_of_predictions
error